In [2]:
import pandas as pd
import numpy as np
import altair as alt
import folium
from folium.plugins import MarkerCluster, HeatMap
# Load data
data_url = "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD"
df = pd.read_csv(data_url)
# Display dataset info
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 28562 entries, 0 to 28561 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 INCIDENT_KEY 28562 non-null int64 1 OCCUR_DATE 28562 non-null object 2 OCCUR_TIME 28562 non-null object 3 BORO 28562 non-null object 4 LOC_OF_OCCUR_DESC 2966 non-null object 5 PRECINCT 28562 non-null int64 6 JURISDICTION_CODE 28560 non-null float64 7 LOC_CLASSFCTN_DESC 2966 non-null object 8 LOCATION_DESC 13585 non-null object 9 STATISTICAL_MURDER_FLAG 28562 non-null bool 10 PERP_AGE_GROUP 19218 non-null object 11 PERP_SEX 19252 non-null object 12 PERP_RACE 19252 non-null object 13 VIC_AGE_GROUP 28562 non-null object 14 VIC_SEX 28562 non-null object 15 VIC_RACE 28562 non-null object 16 X_COORD_CD 28562 non-null float64 17 Y_COORD_CD 28562 non-null float64 18 Latitude 28503 non-null float64 19 Longitude 28503 non-null float64 20 Lon_Lat 28503 non-null object dtypes: bool(1), float64(5), int64(2), object(13) memory usage: 4.4+ MB
Out[2]:
| INCIDENT_KEY | OCCUR_DATE | OCCUR_TIME | BORO | LOC_OF_OCCUR_DESC | PRECINCT | JURISDICTION_CODE | LOC_CLASSFCTN_DESC | LOCATION_DESC | STATISTICAL_MURDER_FLAG | ... | PERP_SEX | PERP_RACE | VIC_AGE_GROUP | VIC_SEX | VIC_RACE | X_COORD_CD | Y_COORD_CD | Latitude | Longitude | Lon_Lat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 244608249 | 05/05/2022 | 00:10:00 | MANHATTAN | INSIDE | 14 | 0.0 | COMMERCIAL | VIDEO STORE | True | ... | M | BLACK | 25-44 | M | BLACK | 986050.000 | 214231.000000 | 40.754692 | -73.993500 | POINT (-73.9935 40.754692) |
| 1 | 247542571 | 07/04/2022 | 22:20:00 | BRONX | OUTSIDE | 48 | 0.0 | STREET | (null) | True | ... | (null) | (null) | 18-24 | M | BLACK | 1016802.000 | 250581.000000 | 40.854402 | -73.882330 | POINT (-73.88233 40.854402) |
| 2 | 84967535 | 05/27/2012 | 19:35:00 | QUEENS | NaN | 103 | 0.0 | NaN | NaN | False | ... | NaN | NaN | 18-24 | M | BLACK | 1048632.000 | 198262.000000 | 40.710634 | -73.767773 | POINT (-73.76777349199995 40.71063412500007) |
| 3 | 202853370 | 09/24/2019 | 21:00:00 | BRONX | NaN | 42 | 0.0 | NaN | NaN | False | ... | M | UNKNOWN | 25-44 | M | BLACK | 1014493.000 | 242565.000000 | 40.832417 | -73.890714 | POINT (-73.89071440599997 40.832416753000075) |
| 4 | 27078636 | 02/25/2007 | 21:00:00 | BROOKLYN | NaN | 83 | 0.0 | NaN | NaN | False | ... | M | BLACK | 25-44 | M | BLACK | 1009149.375 | 190104.703125 | 40.688443 | -73.910219 | POINT (-73.91021857399994 40.68844345900004) |
5 rows × 21 columns
In [3]:
# Correcting data types and extracting valuable info
df["OCCUR_DATE"] = pd.to_datetime(df["OCCUR_DATE"])
df["Hour"] = df["OCCUR_TIME"].str[:2].astype(int)
df["DayOfWeek"] = df["OCCUR_DATE"].dt.day_name()
df["Month"] = df["OCCUR_DATE"].dt.month
df.drop(columns=["OCCUR_TIME"], inplace=True)
# Bin Month into Season
season_mapping = {0: 'Winter', 1: 'Spring', 2: 'Summer', 3: 'Fall'}
df["Season"] = (df["Month"] % 12 // 3).map(season_mapping)
# Bin Hour into TimeofDay
df["TimeofDay"] = pd.cut(
df["Hour"],
bins=[-1, 4, 11, 16, 20, 24],
labels=["Night", "Morning", "Afternoon", "Evening", "Night"],
right=True,
ordered=False
)
In [4]:
df["STATISTICAL_MURDER_FLAG"] = df["STATISTICAL_MURDER_FLAG"].astype(int)
# Murder to Non-Murder Counts and Ratio
murder_counts = df['STATISTICAL_MURDER_FLAG'].value_counts().reset_index()
murder_counts.columns = ['MurderFlag', 'Count']
murder_counts['Proportion'] = murder_counts['Count'] / murder_counts['Count'].sum()
base_chart = alt.Chart(murder_counts).encode(
x=alt.X('MurderFlag:N', title='Murder (1) or Non-Murder (0)'),
tooltip=[
alt.Tooltip('MurderFlag:N', title='MurderFlag'),
alt.Tooltip('Count:Q', title='Incident Count'),
alt.Tooltip('Proportion:Q', format='.2%', title='Proportion of Total Incidents')
]
)
# Bar chart
bar_chart = base_chart.mark_bar().encode(
y=alt.Y('Count:Q', title='Incident Count', axis=alt.Axis(titleColor='blue')),
color=alt.Color('MurderFlag:N', scale=alt.Scale(scheme='category10'), legend=None)
)
# Add invisible line plot to add proportion axis
line_chart = base_chart.mark_line(opacity=0).encode(
y=alt.Y('Proportion:Q', title='Proportion of Total Incidents (%)',
axis=alt.Axis(format='.0%', titleColor='orange'))
)
# Combine everything
murder_counts = alt.layer(bar_chart, line_chart).resolve_scale(
y='independent'
).properties(
title='Murder vs. Non-Murder Incidents',
width=400,
height=300
)
murder_counts
Out[4]:
In [5]:
# Count Per Season and Boro
season_borough_counts = df.groupby(["Season", "BORO"]).size().reset_index(name="Count")
season_borough_counts["Proportion"] = season_borough_counts.groupby("Season")["Count"].transform("sum") / df.shape[0]
bar_chart = alt.Chart(season_borough_counts).mark_bar().encode(
x=alt.X("Season:N", title="Season", sort=['Winter', 'Spring', 'Summer', 'Fall']),
y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor='blue')),
color=alt.Color("BORO:N", legend=alt.Legend(title="Borough")),
tooltip=[
alt.Tooltip("Season:N", title="Season"),
alt.Tooltip("BORO:N", title="Borough"),
alt.Tooltip("Count:Q", title="Incident Count"),
alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total (Season)")
]
)
season_borough_chart = bar_chart.properties(
title="Seasonal Trends in Incident Counts by Borough",
width=800,
height=400
)
season_borough_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False) C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[5]:
In [6]:
# Get the start date
df["Month"] = df["OCCUR_DATE"].dt.to_period("M").dt.start_time
# Incident Count by Month and Season
monthly_counts = df.groupby(["Month", "Season"]).size().reset_index(name="Count")
# Dates of Notable events
event_dates = pd.DataFrame({
"EventDate": pd.to_datetime(["2020-03-01", "2020-06-01", "2021-01-06"]),
"EventDescription": ["COVID-19 Pandemic Start", "George Floyd Protests", "Capitol Riots"]
})
# Base line chart
monthly_chart = alt.Chart(monthly_counts).mark_line().encode(
x=alt.X("Month:T", title="Month"),
y=alt.Y("Count:Q", title="Incident Count"),
tooltip=["Month:T", "Count:Q"]
).properties(
title="Incident Frequency Over Time (Monthly Aggregation)",
width=800,
height=400
)
# Event lines
event_lines = alt.Chart(event_dates).mark_rule(color="red", strokeDash=[5, 3]).encode(
x="EventDate:T",
tooltip=["EventDate:T", "EventDescription:N"]
)
# Seasonality highlights
season_colors = alt.Chart(monthly_counts).mark_area(opacity=0.2).encode(
x="Month:T",
y="Count:Q",
color=alt.Color("Season:N", scale=alt.Scale(domain=["Winter", "Spring", "Summer", "Fall"],
range=["#377eb8", "#4daf4a", "#ffcc00", "#a65628"])),
tooltip=["Month:T", "Season:N"]
)
# Combine the layers
monthly_incident_chart = (monthly_chart + season_colors + event_lines).interactive()
monthly_incident_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False) C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[6]:
In [7]:
# Aggregate incidents by time of day and murder vs non-murder
time_of_day_counts = df.groupby(['Hour', 'STATISTICAL_MURDER_FLAG']).size().reset_index(name='Count')
time_of_day_counts['STATISTICAL_MURDER_FLAG'] = time_of_day_counts['STATISTICAL_MURDER_FLAG'].replace(
{0: 'Non-Murder', 1: 'Murder'}
)
base_chart = alt.Chart(time_of_day_counts).encode(
x=alt.X('Hour:O', title='Hour of Day')
)
area_chart = base_chart.mark_area(opacity=0.7).encode(
y=alt.Y('Count:Q', title='Incident Count'),
color=alt.Color('STATISTICAL_MURDER_FLAG:N', legend=alt.Legend(title="Incident Type")),
tooltip=['Hour:O', 'STATISTICAL_MURDER_FLAG:N', 'Count:Q']
)
time_of_day_murdernonmurder_chart = area_chart.properties(
title='Time of Day Patterns by Incident Type',
width=700,
height=400
)
time_of_day_murdernonmurder_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[7]:
In [8]:
# Counts by Hour and DayofWeek
hour_day_counts = df.groupby(["Hour", "DayOfWeek"]).size().reset_index(name="Count")
# Sort DayofWeek
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# Heatmap
heatmap = alt.Chart(hour_day_counts).mark_rect().encode(
x=alt.X("Hour:O", title="Hour of Day"),
y=alt.Y("DayOfWeek:O", sort=day_order, title="Day of Week"),
color=alt.Color("Count:Q", scale=alt.Scale(scheme="viridis"), title="Incident Count"),
tooltip=[
alt.Tooltip("Hour:O", title="Hour of Day"),
alt.Tooltip("DayOfWeek:O", title="Day of Week"),
alt.Tooltip("Count:Q", title="Incident Count")
]
)
heatmap.properties(
title="Incident Frequency by Hour and Day of Week",
width=800,
height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[8]:
In [9]:
# Define Age Bins
included_age_groups = ["<18", "18-24", "25-44", "45-64", "65+"] # Define age group order
df["AGE_GROUP"] = df["VIC_AGE_GROUP"].apply(lambda x: x if x in included_age_groups else "Other")
# Count incidents by age group
age_group_counts = df["AGE_GROUP"].value_counts().reset_index()
age_group_counts.columns = ["Age Group", "Count"]
age_group_counts["Proportion"] = age_group_counts["Count"] / age_group_counts["Count"].sum()
# Sort age
age_group_counts["Age Group"] = pd.Categorical(
age_group_counts["Age Group"], categories=included_age_groups + ["Other"], ordered=True
)
age_group_chart = alt.Chart(age_group_counts).mark_bar().encode(
x=alt.X("Age Group:N", title="Age Group", sort=included_age_groups + ["Other"]),
y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor="blue")),
color=alt.Color("Age Group:N", legend=None),
tooltip=[
alt.Tooltip("Age Group:N", title="Age Group"),
alt.Tooltip("Count:Q", title="Incident Count"),
alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
]
)
age_group_chart.properties(
title="Incident Distribution by Age Group",
width=800,
height=400
)
Out[9]:
In [10]:
# Prepare Race Grouping
included_races = ["BLACK", "BLACK HISPANIC", "WHITE", "WHITE HISPANIC", "ASIAN/PACIFIC ISLANDER"]
df["RACE_GROUP"] = df["VIC_RACE"].apply(lambda x: x if x in included_races else "Other")
# Incident by race count
race_group_counts = df["RACE_GROUP"].value_counts().reset_index()
race_group_counts.columns = ["Race", "Count"]
race_group_counts["Proportion"] = race_group_counts["Count"] / race_group_counts["Count"].sum()
# Sort highest to lowest
race_group_counts = race_group_counts.sort_values("Count", ascending=False)
sorted_races = race_group_counts["Race"].tolist() # Extract sorted race order
race_group_chart = alt.Chart(race_group_counts).mark_bar().encode(
x=alt.X("Race:N", title="Race", sort=sorted_races),
y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor="blue")),
color=alt.Color("Race:N", legend=None),
tooltip=[
alt.Tooltip("Race:N", title="Race"),
alt.Tooltip("Count:Q", title="Incident Count"),
alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
]
)
race_group_chart.properties(
title="Incident Distribution by Race (Ordered by Count)",
width=800,
height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[10]:
In [11]:
# Valuecount location description column
location_desc_counts = df["LOCATION_DESC"].value_counts().reset_index()
location_desc_counts.columns = ["Location Description", "Count"]
# Assign null valus as "Unknown" and smaller categories as "Other"
threshold = 100
location_desc_counts["Location Description"] = location_desc_counts["Location Description"].apply(
lambda x: "Unknown" if x in ["(null)", "NONE"]
else "Other" if location_desc_counts.loc[location_desc_counts["Location Description"] == x, "Count"].values[0] < threshold
else x
)
# Recalculate counts and proportions after grouping
location_desc_counts = location_desc_counts.groupby("Location Description").agg({"Count": "sum"}).reset_index()
location_desc_counts["Proportion"] = location_desc_counts["Count"] / location_desc_counts["Count"].sum()
loc_desc_chart = alt.Chart(location_desc_counts).mark_bar().encode(
x=alt.X("Count:Q", title="Count"),
y=alt.Y("Location Description:N", title="Location Description", sort="-x"),
color=alt.Color("Location Description:N", title="Category", legend=alt.Legend(title="Location Description")),
tooltip=[
alt.Tooltip("Location Description:N", title="Location Description"),
alt.Tooltip("Count:Q", title="Count"),
alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
]
)
loc_desc_chart.properties(
title="Distribution of Location Descriptions (Binned with Unknown and Other Categories)",
width=800,
height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[11]:
In [12]:
# Value counts Location Classification Description column
loc_class_counts = df["LOC_CLASSFCTN_DESC"].value_counts().reset_index()
loc_class_counts.columns = ["Location Classification Description", "Count"]
# Group smaller categories into "Other"
threshold = 100
loc_class_counts["Location Classification Description"] = loc_class_counts["Location Classification Description"].apply(
lambda x: "Other" if loc_class_counts.loc[loc_class_counts["Location Classification Description"] == x, "Count"].values[0] < threshold else x
)
# Recalculate counts and proportions after grouping
loc_class_counts = loc_class_counts.groupby("Location Classification Description").agg({"Count": "sum"}).reset_index()
loc_class_counts["Proportion"] = loc_class_counts["Count"] / loc_class_counts["Count"].sum()
loc_class_chart = alt.Chart(loc_class_counts).mark_bar().encode(
x=alt.X("Count:Q", title="Count"),
y=alt.Y("Location Classification Description:N", title="Location Classification Description", sort="-x"),
color=alt.Color("Location Classification Description:N", title="Category", legend=alt.Legend(title="Location Classification Description")),
tooltip=[
alt.Tooltip("Location Classification Description:N", title="Location Classification Description"),
alt.Tooltip("Count:Q", title="Count"),
alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
]
)
loc_class_chart.properties(
title="Distribution of Location Classification Descriptions",
width=800,
height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[12]:
In [13]:
shooting_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)
marker_cluster = MarkerCluster().add_to(shooting_map)
for _, row in df.dropna(subset=["Latitude", "Longitude"]).iterrows():
folium.CircleMarker(
location=[row["Latitude"], row["Longitude"]],
radius=3,
color="red",
fill=True,
fill_color="red"
).add_to(marker_cluster)
shooting_map.save("shooting_map.html")
shooting_map
Out[13]:
Make this Notebook Trusted to load map: File -> Trust Notebook